/* serpent_asm.S */
/*
    This file is part of the AVR-Crypto-Lib.
    Copyright (C) 2008  Daniel Otte (daniel.otte@rub.de)

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
*/

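/* 
 * File:        serpent_asm.S
 * Author:      Daniel Otte
 * Date:        2008-08-07
 * License:     GPLv3 or later
 * Description: Implementation of the serpent linear transformation, key
 *              setup, encryption and decryption (the sbox functions are
 *              called from here but defined elsewhere).
 * 
 */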
 
#include <avr/io.h>
#include "avr-asm-macros.S"

/*
static void serpent_lt(uint8_t *b){
	X0 = rotl32(X0, 13);
	X2 = rotl32(X2,  3);
	X1 ^= X0 ^ X2;
	X3 ^= X2 ^ (X0 << 3);
	X1 = rotl32(X1, 1);
	X3 = rotl32(X3, 7);
	X0 ^= X1 ^ X3;
	X2 ^= X3 ^ (X1 << 7);
	X0 = rotl32(X0, 5);
	X2 = rotr32(X2, 10);
}
*/

#if 0
/*
 * DISABLED CODE - this whole region is excluded by the preprocessor and is
 * never assembled.
 *
 * It is a register-only variant of serpent_lt: the 16-byte block is loaded
 * into r4..r19, transformed there and written back. The active serpent_lt
 * further down in this file works on the block in memory instead; the label
 * below has the same name and would collide with it if this region were
 * enabled as-is.
 *
 * Register aliases: A* = X0, B* = X1, C* = X2, D* = X3, T* = scratch word
 * (these T* values differ from the T* aliases defined after the #endif).
 *
 * NOTE(review): some steps below do not match the reference C code quoted
 * above - e.g. X2 is loaded and rotated with exactly the same sequence as X0
 * (a rotate left by 13), and the step commented "X3 ^= X2 ^ (X0 << 3)"
 * writes to the C registers. Verify every step before re-enabling.
 */
A0 =  4
A1 =  5
A2 =  6
A3 =  7
B0 =  8
B1 =  9
B2 = 10
B3 = 11
C0 = 12
C1 = 13
C2 = 14
C3 = 15
D0 = 16
D1 = 17
D2 = 18
D3 = 19
T0 = 20
T1 = 21
T2 = 22
T3 = 23

serpent_lt:
	/* push_range comes from avr-asm-macros.S; presumably saves r4..r17 - verify */
	push_range 4, 17
	movw r26, r24
	/* load X0 with its halves swapped (= rotate left by 16) ... */
	ld A2, X+
	ld A3, X+
	ld A0, X+
	ld A1, X+
	/* ... then rotate right by 3, giving X0 <<< 13; r0 feeds the wrap-around bits */
	ldi r20, 3
	mov r0, A0
1:	
	lsr r0
	ror A3
	ror A2
	ror A1
	ror A0
	dec r20
	brne 1b
	/* X1 is loaded unmodified */
	ld B0, X+
	ld B1, X+
	ld B2, X+
	ld B3, X+
	
	/* X2: same swapped load and 3-bit right rotate as used for X0 above */
	ld C2, X+
	ld C3, X+
	ld C0, X+
	ld C1, X+
	ldi r20, 3
	mov r0, C0
1:	
	lsr r0
	ror C3
	ror C2
	ror C1
	ror C0
	dec r20
	brne 1b

	/* X3 is loaded unmodified */
	ld D0, X+
	ld D1, X+
	ld D2, X+
	ld D3, X+
	/* X1 ^= X0 ^ X2; */
	eor B0, A0
	eor B0, C0
	eor B1, A1
	eor B1, C1
	eor B2, A2
	eor B2, C2
	eor B3, A3
	eor B3, C3
	/* X3 ^= X2 ^ (X0 << 3); */
	/* T := X0 << 3 */
	mov T0, A0
	mov T1, A1
	mov T2, A2
	mov T3, A3
	ldi r24, 3
1:
	lsl T0
	rol T1
	rol T2
	rol T3
	dec r24
	brne 1b
	/* NOTE(review): the destination here is C (X2) and the source is B (X1) */
	eor C0, B0
	eor C0, T0
	eor C1, B1
	eor C1, T1
	eor C2, B2
	eor C2, T2
	eor C3, B3
	eor C3, T3
	/*	X1 = rotl32(X1, 1); */
	mov r0, B3
	lsl r0
	rol B0
	rol B1
	rol B2
	rol B3
	/* X3 = rotl32(X3, 7); done as a byte rotate left by 8, then 1 bit right */
	mov r0, D3
	mov D3, D2
	mov D2, D1
	mov D1, D0
	mov D0, r0
	lsr r0
	ror D3
	ror D2
	ror D1
	ror D0
	/* 	X0 ^= X1 ^ X3; */
	eor A0, B0
	eor A0, D0
	eor A1, B1
	eor A1, D1
	eor A2, B2
	eor A2, D2
	eor A3, B3
	eor A3, D3
	/*  X2 ^= X3 ^ (X1 << 7); */
	/* T := X1 shifted up by one byte, then right by one bit */
	/* NOTE(review): T3 is not part of the ror chain below - verify */
	mov T1, B0
	mov T2, B1
	mov T3, B2
	clr T0
	mov r0, B3
	lsr r0
	ror T2
	ror T1
	ror T0 
	eor C0, D0
	eor C0, T0
	eor C1, D1
	eor C1, T1
	eor C2, D2
	eor C2, T2
	eor C3, D3
	eor C3, T3
	/* 	X0 = rotl32(X0, 5); */
	ldi r24, 5
	mov r0, A3
1:	
	lsl r0
	rol A0
	rol A1
	rol A2
	rol A3
	dec r24
	brne 1b
	/* X2 = rotr32(X2, 10); done as a byte rotate right by 8, then 2 bits right */
	mov r0, C0
	mov C0, C1
	mov C1, C2
	mov C2, C3	
	mov C3, r0
	ldi r24, 2
1:
	lsr r0
	ror C2
	ror C1
	ror C0
	ror C3	
	dec r24
	brne 1b
	
	/*
	 * Write the 16 result bytes back: Z is pointed just past D3 inside the
	 * register file (data-space address D3+1) and walks down to A0 while X
	 * walks back from the end of the block to its start.
	 */
	clr r31
	ldi r30, D3+1
	ldi r24, 16
1:
	ld r0, -Z
	st -X, r0	
	dec r24
	brne 1b
	
	pop_range 4, 17
	ret
#endif

/*
 * Scratch-register aliases used by memrotr32, memrotl32 and memeor32.
 * In the rotate helpers T0..T3 hold one 32-bit word (T0 = byte at the lowest
 * address) and TT is an extra byte that feeds the wrap-around bit.
 * memeor32 uses T0/T1 as data bytes and T2 as its loop counter.
 */
T0 = 22
T1 = 23
T2 = 24
T3 = 25
TT = 21
/*
 * memrotr32 - rotate the 32-bit word at X to the right, in place.
 *
 *   in:       X (r26:r27) -> word, least significant byte first
 *             r20          = number of bit positions, 1..8
 *   out:      word at X rotated; X unchanged; r20 = 0
 *   clobbers: T0..T3, TT, status flags
 *
 * TT is loaded once with a copy of the low byte and is only shifted after
 * that, so it can supply the correct wrap-around bit for at most eight
 * iterations. Longer rotations have to be split into several calls.
 */
memrotr32:
	ld	T0, X+			/* byte 0 (LSB) */
	ld	T1, X+
	ld	T2, X+
	ld	T3, X+			/* byte 3 (MSB); X is now one past the word */
	mov	TT, T0			/* source of the bits that wrap around */
.Lmemrotr32_bit:
	lsr	TT			/* C = bit leaving the low end */
	ror	T3			/* ... re-enters at the high end */
	ror	T2
	ror	T1
	ror	T0
	dec	r20
	brne	.Lmemrotr32_bit
	st	-X, T3			/* write back, restoring X on the way */
	st	-X, T2
	st	-X, T1
	st	-X, T0
	ret
	
/*
 * memrotl32 - rotate the 32-bit word at X to the left, in place.
 *
 *   in:       X (r26:r27) -> word, least significant byte first
 *             r20          = number of bit positions, 1..8
 *   out:      word at X rotated; X unchanged; r20 = 0
 *   clobbers: T0..T3, TT, status flags
 *
 * TT is loaded once with a copy of the high byte and is only shifted after
 * that, so it can supply the correct wrap-around bit for at most eight
 * iterations. Longer rotations have to be split into several calls.
 */
memrotl32:
	ld	T0, X+			/* byte 0 (LSB) */
	ld	T1, X+
	ld	T2, X+
	ld	T3, X+			/* byte 3 (MSB); X is now one past the word */
	mov	TT, T3			/* source of the bits that wrap around */
.Lmemrotl32_bit:
	lsl	TT			/* C = bit leaving the high end */
	rol	T0			/* ... re-enters at the low end */
	rol	T1
	rol	T2
	rol	T3
	dec	r20
	brne	.Lmemrotl32_bit
	st	-X, T3			/* write back, restoring X on the way */
	st	-X, T2
	st	-X, T1
	st	-X, T0
	ret

/*
 * memeor32 - XOR the four bytes at Z into the four bytes at X.
 *
 *   in:       X (r26:r27) -> destination word
 *             Z (r30:r31) -> source word
 *   out:      destination updated; X and Z both advanced by 4
 *   clobbers: T0, T1, T2, status flags
 */
memeor32:
	ldi	T2, 4			/* bytes left to process */
.Lmemeor32_byte:
	ld	T0, X			/* destination byte (X not advanced yet) */
	ld	T1, Z+			/* source byte */
	eor	T0, T1
	st	X+, T0
	dec	T2
	brne	.Lmemeor32_byte
	ret

/*
 * serpent_lt - Serpent linear transformation, applied in place.
 *
 *   in:       r24:r25 -> 16-byte block, treated as four 32-bit words
 *             X0..X3 at byte offsets 0, 4, 8 and 12
 *   clobbers: r18-r27, r30, r31 (directly and through the mem* helpers)
 *
 * X (r26:r27) is walked across the block with adiw/sbiw. memrotl32 and
 * memrotr32 leave X unchanged; memeor32 advances both X and Z by 4. The
 * rotate helpers handle at most 8 bit positions per call, hence the split
 * rotations. The temporary word T lives in r18..r21 and is handed to
 * memeor32 by pointing Z at data-space address 18, i.e. at r18 itself
 * (NOTE(review): relies on the register file being mapped into data space;
 * confirm for the target device).
 */
serpent_lt:
	/* X0 := X0 <<< 13  (7 + 6) */
	movw r26, r24
	ldi r20, 7
	rcall memrotl32
	ldi r20, 6
	rcall memrotl32
	/* X2 := X2 <<< 3 */
	adiw r26, 8
	ldi r20, 3
	rcall memrotl32
	/* X1 ^= X2   (Z -> X2, X -> X1) */
	movw r30, r26
	sbiw r26, 4
	rcall memeor32
	/* X1 ^= X0   (X back to X1; Z moved from X3 back to X0) */
	sbiw r26, 4
	sbiw r30, 12
	rcall memeor32
	/* X3 ^= X2   (X is at X2 after the previous call) */
	movw r30, r26
	adiw r26, 4
	rcall memeor32
	/* T := X0   (X is one past the block here, so -16 reaches X0) */
	sbiw r26, 16
	ld r18, X+
	ld r19, X+
	ld r20, X+
	ld r21, X+
	/* T := T<<3 */
	ldi r22, 3
1:
	lsl r18
	rol r19
	rol r20
	rol r21
	dec r22
	brne 1b
	clr r31 
	/* X3 ^= T   (X moves from X1 to X3; Z = 18 addresses r18..r21) */
	adiw r26, 8
	ldi r30, 18
	rcall memeor32
	/* X1 := X1<<<1 */
	sbiw r26, 12
	ldi r20, 1
	rcall memrotl32
	/* X3 := X3<<<7 */
	adiw r26, 8
	ldi r20, 7
	rcall memrotl32
	/* X0 ^= X3 */
	movw r30, r26
	sbiw r26, 12
	rcall memeor32
	/* X0 ^= X1   (X is at X1 after the previous call) */
	movw r30, r26
	sbiw r26, 4
	rcall memeor32
	/* X2 ^= X3   (X: X1 -> X2, Z: X2 -> X3) */
	adiw r26, 4
	adiw r30, 4
	rcall memeor32
	/* T := X1<<<8   (bytes are loaded one register position higher) */
	sbiw r26, 8
	ld r19, X+
	ld r20, X+
	ld r21, X+
	ld r18, X+
	/*
	 * T := T>>>1; T &= 0xffffff80   => T = X1 << 7
	 * clr does not change the carry flag, so the final ror leaves only the
	 * bit shifted out of r19 in bit 7 of the low byte.
	 */
	lsr r18
	ror r21
	ror r20
	ror r19
	clr r18
	ror r18
	clr r31
	ldi r30, 18
	/* X2 ^= T   (X is at X2 after loading X1) */
	rcall memeor32
	/* X0 := X0 <<< 5 */
	sbiw r26, 12
	ldi r20, 5
	rcall memrotl32
	/* X2 := X2 >>> 10  (7 + 3; X0 + 8 bytes is X2) */
	adiw r26, 8
	ldi r20, 7
	rcall memrotr32
	ldi r20, 3
	rcall memrotr32
	ret

/*
 * serpent_inv_lt - inverse of serpent_lt, applied in place.
 *
 *   in:       r24:r25 -> 16-byte block, treated as four 32-bit words
 *             X0..X3 at byte offsets 0, 4, 8 and 12
 *   clobbers: r18-r27, r30, r31 (directly and through the mem* helpers)
 *
 * The steps of serpent_lt are undone in reverse order. X (r26:r27) and
 * Z (r30:r31) are walked across the block with adiw/sbiw; memrotl32 and
 * memrotr32 leave X unchanged, memeor32 advances both X and Z by 4.
 * The temporary word T lives in r18..r21 and is handed to memeor32 by
 * pointing Z at data-space address 18, i.e. at r18 itself
 * (NOTE(review): relies on the register file being mapped into data space;
 * confirm for the target device).
 */
serpent_inv_lt:
	/* X0 := X0 >>> 5 */
	movw r26, r24
	ldi r20, 5
	rcall memrotr32
	/* X2 := X2 <<< 10  (7 + 3) */
	adiw r26, 8
	ldi r20, 7
	rcall memrotl32
	ldi r20, 3
	rcall memrotl32
	/* X2 ^= X3 */
	movw r30, r26
	adiw r30, 4
	rcall memeor32
	/* X back to X2; Z moved from one past the block back to X1 */
	sbiw r26, 4
	sbiw r30, 12
	/* T := X1<<7, built as (X1 <<< 8) >> 1 with the wrapped-in bits dropped */
	ld r19, Z+
	ld r20, Z+
	ld r21, Z+
	ld r18, Z+
	lsr r18
	ror r21
	ror r20
	ror r19
	/* clr leaves the carry untouched, so only the bit from r19 survives */
	clr r18
	ror r18
    clr r31
    /* X2 ^= T */
    ldi r30, 18
    rcall memeor32
    /* X0 ^= X1   (X: X3 -> X0, Z -> X1) */
    sbiw r26, 12
    movw r30, r26
    adiw r30, 4
    rcall memeor32
    /* X0 ^= X3   (X back to X0; Z: X2 -> X3) */
    sbiw r26, 4
    adiw r30, 4
    rcall memeor32
    /* X1 := X1>>>1   (X is at X1 after the previous call) */
    ldi r20, 1
	rcall memrotr32
	/* X3 := X3>>>7 */
	adiw r26, 8
	ldi r20, 7
	rcall memrotr32
	/* X3 ^= X2   (Z moved from one past the block back to X2) */
	sbiw r30, 8
	rcall memeor32
	/* X back to X3 */
	sbiw r26, 4
	/* T:= X0<<3   (Z: X3 -> X0) */
	sbiw r30, 12
	ld r18, Z+
	ld r19, Z+
	ld r20, Z+
	ld r21, Z+
	ldi r24, 3
1:
	lsl r18
	rol r19
	rol r20
	rol r21
	dec r24
	brne 1b
	/* X3 ^= T */
	clr r31
	ldi r30, 18
	rcall memeor32
	/* X1 ^= X0   (X: one past the block -> X1, Z -> X0) */
	sbiw r26, 12
	movw r30, r26
	sbiw r30, 4
	rcall memeor32
	/* X1 ^= X2   (Z is at X1 after the previous call: X := X1, Z -> X2) */
	movw r26, r30
	adiw r30, 4
	rcall memeor32
	/* X2 := X2 >>> 3   (X is at X2 after the previous call) */
	ldi r20, 3
	rcall memrotr32
	/* X0 := X0 >>> 13  (7 + 6) */
	sbiw r26, 8
	ldi r20, 7
	rcall memrotr32
	ldi r20, 6
	rcall memrotr32
	ret

/*
#define GOLDEN_RATIO 0x9e3779b9l

static uint32_t serpent_gen_w(uint32_t * b, uint8_t i){
	uint32_t ret;
	ret = b[0] ^ b[3] ^ b[5] ^ b[7] ^ GOLDEN_RATIO ^ (uint32_t)i;
	ret = rotl32(ret, 11);
	return ret;
}
*/
/*
 * serpent_gen_w - compute one prekey word:
 *     rotl32(b[0] ^ b[3] ^ b[5] ^ b[7] ^ 0x9e3779b9 ^ i, 11)
 *
 *   in:      r24:r25 = b, pointer to eight 32-bit words (LSB first)
 *            r22     = i
 *   out:     r22 (LSB), r23, r24, r25 (MSB) = result
 *   trashes: r20-r25, r30-r31
 */
serpent_gen_w:
	movw	r30, r24
	/* acc := b[0] ^ i  (i only touches the low byte) */
	ld	r21, Z+
	eor	r22, r21
	ld	r23, Z+
	ld	r24, Z+
	ld	r25, Z+
	/* acc ^= b[3] ^ b[5] ^ b[7] */
	adiw	r30, 4			/* step over b[1] */
	ldi	r20, 3
.Lserpent_gen_w_xor:
	adiw	r30, 4			/* step over b[2] / b[4] / b[6] */
	ld	r21, Z+
	eor	r22, r21
	ld	r21, Z+
	eor	r23, r21
	ld	r21, Z+
	eor	r24, r21
	ld	r21, Z+
	eor	r25, r21
	dec	r20
	brne	.Lserpent_gen_w_xor
	/* acc ^= GOLDEN_RATIO, one byte at a time */
	ldi	r21, lo8(0x9e3779b9)
	eor	r22, r21
	ldi	r21, hi8(0x9e3779b9)
	eor	r23, r21
	ldi	r21, hlo8(0x9e3779b9)
	eor	r24, r21
	ldi	r21, hhi8(0x9e3779b9)
	eor	r25, r21
	/* acc <<<= 11: rotate left by one whole byte first ... */
	mov	r21, r25
	mov	r25, r24
	mov	r24, r23
	mov	r23, r22
	mov	r22, r21
	/* ... then by three single bits; r21 mirrors the top byte and feeds the wrap-around */
	mov	r21, r25
	ldi	r20, 3
.Lserpent_gen_w_rot:
	lsl	r21
	rol	r22
	rol	r23
	rol	r24
	rol	r25
	dec	r20
	brne	.Lserpent_gen_w_rot
	ret

/*
 * void serpent_init(const void *key, uint16_t keysize_b, serpent_ctx_t *ctx)
 *
 * Key setup. The user key is expanded to a 256-bit working buffer on the
 * stack, 132 prekey words are generated with serpent_gen_w and written to
 * ctx (132*4 = 528 bytes), and finally the 33 round keys (16 bytes each)
 * are passed through sbox128 in place.
 *
 * Short keys (keysize_b < 256) are padded by setting the single bit that
 * follows the last key bit and leaving all higher bits of the buffer zero:
 *     buffer[keysize_b / 8] |= 1 << (keysize_b % 8)
 * Bits of the last key byte above keysize_b are taken as supplied.
 * Keys with keysize_b >= 256 use the first 32 bytes unchanged.
 */
/*
 * param key     is passed in r24:r25
 * param keysize is passed in r22:r23
 * param ctx     is passed in r20:r21
 */
/*
 * Register use while the buffer is prepared:
 *   r22 = number of key bytes to copy, (keysize_b + 7) / 8, at most 32
 *   r23 = keysize_b % 8 (only meaningful when T is clear)
 *   T   = set when keysize_b >= 256, i.e. no padding bit is needed
 *   r19 = last key byte copied
 * stack_alloc / stack_free / push_ / pop_ come from avr-asm-macros.S;
 * stack_alloc presumably leaves Z = SP, so Z+1 is the first buffer byte
 * (verify against the macro definition). r1 is assumed to be zero, as the
 * avr-gcc calling convention guarantees.
 */
.global serpent_init
serpent_init:
    stack_alloc 32
    adiw r30, 1
	push_ r30, r31 /* remember the buffer address */
    movw r26, r22 /* r26:r27 = keysize_b */
    tst r27
    breq 1f
	/* keysize_b >= 256: full-length key, nothing to append */
	set
	ldi r22, 32
	rjmp 2f
1:
	/* keysize_b < 256: keep the leftover bit count before r22 is reused */
	clt
	mov r23, r26
	andi r23, 0x07
	adiw r26, 7 /* may carry into r27 for keysize_b 249..255 */
	lsr r27
	ror r26
	lsr r26
	lsr r26
	mov r22, r26
2:	
	ldi r27, 32
3:	/* set buffer to zero */
	st Z+, r1
	dec r27
	brne 3b
	
	movw r26, r24 /* X points to the key */
	sbiw r30, 32  /* Z back to the start of the buffer */
	tst r22
	breq 5f /* if keylength_b==0 */
4:	/* copy keybytes to buffer */
	ld r19, X+
	st Z+, r19
	dec r22
	brne 4b
5:
	brts 7f /* if keylength_b >= 256 */
	ldi r18, 0x01
	tst r23
	brne 6f
	/* key ends on a byte boundary: the padding bit opens a new byte */
	st Z, r18
	rjmp 7f
6:	/* shift the one to the right position */
	lsl r18
	dec r23
	brne 6b
	/* key ends inside the last copied byte: merge the bit into it */
	or r18, r19
	st -Z, r18
7: /* post "appending 1 thing" buffer is ready for subkey generation */
	movw r26, r20  /* X points to the context */
	
	pop_ r19, r18 /* r18:r19 points to the buffer */
	push r16
	clr r16 /* r16 = index i of the prekey word */
8:
	movw r24, r18
	mov  r22, r16
	rcall serpent_gen_w
	movw r30, r18
	ldi r20, 7*4
1: /* the memmove: slide the 8-word window down by one word */
	ldd r0, Z+4
	st Z+, r0
	dec r20
	brne 1b
  /* store new word in buffer and context */	
	st Z+, r22
	st Z+, r23
	st Z+, r24
	st Z+, r25
	st X+, r22
	st X+, r23
	st X+, r24
	st X+, r25
	
	inc r16
	cpi r16, 132
	brne 8b	
	
	/* X is now ctx + 132*4; rewind into Y for the sbox pass */
	push_ r28, r29
	movw r28, r26
	subi r28, lo8(132*4)
	sbci r29, hi8(132*4)
	ldi r16, 33
2:
	/*
	 * Round key k (0..32) is passed to sbox128 with box number 35 - k
	 * (r16 counts 33 down to 1). sbox128 is defined elsewhere; presumably
	 * it reduces the number modulo 8 - verify.
	 */
	movw r24, r28
	adiw r28, 16
	ldi r22, 2
	add r22, r16
	rcall sbox128
	dec r16
	brne 2b
	pop_ r29, r28, r16
	stack_free 32
	ret

/*
 * void serpent_enc(void *buffer, const serpent_ctx_t *ctx){
 *
 * Encrypts the 16-byte block at buffer in place:
 *   rounds 0..30: buffer ^= k[r]; sbox128(buffer, r); serpent_lt(buffer)
 *   round  31:    buffer ^= k[31]; sbox128(buffer, 31); buffer ^= k[32]
 * where k[r] is the 16-byte round key at ctx + 16*r.
 */
/*
 * param buffer is passed in r24:r25
 * param ctx    is passed in r22:r23
 */
/*
 * Register use:
 *   r14:r15 = buffer
 *   r12:r13 = pointer to the next round key
 *   r16     = round number
 * memxor and sbox128 are defined elsewhere; memxor is called here with
 * destination in r24:r25, source in r22:r23 and length in r20:r21.
 * r1 is assumed to be zero, as the avr-gcc calling convention guarantees.
 */
.global serpent_enc
serpent_enc:

	push_ r12, r13, r14, r15, r16 
	clr r16
	movw r14, r24
	movw r12, r22
1:
	/* buffer ^= k[r16]; the key pointer is advanced before the call */
	movw r24, r14
	movw r22, r12
	ldi r20, 16
	add r12, r20
	adc r13, r1
	clr r21
	rcall memxor
	movw r24, r14
	mov r22, r16
	rcall sbox128
	movw r24, r14
	rcall serpent_lt
	
	inc r16
	cpi r16, 31
	brne 1b
	
	/* last round: no linear transformation, extra key mixing instead */
	movw r24, r14
	movw r22, r12
	ldi r20, 16
	add r12, r20
	adc r13, r1
	clr r21
	rcall memxor
	movw r24, r14
	mov r22, r16
	rcall sbox128
	
	/* buffer ^= k[32]; registers are restored first so memxor can return
	 * directly to our caller (tail call) */
	movw r24, r14
	movw r22, r12
	ldi r20, 16
	clr r21
	pop_ r16, r15, r14, r13, r12
	rjmp memxor

/*
 * void serpent_dec(void *buffer, const serpent_ctx_t *ctx){
 *
 * Decrypts the 16-byte block at buffer in place by undoing serpent_enc:
 *   buffer ^= k[32]; inv_sbox128(buffer, 31); buffer ^= k[31]
 *   rounds 30..0: serpent_inv_lt(buffer); inv_sbox128(buffer, r); buffer ^= k[r]
 * where k[r] is the 16-byte round key at ctx + 16*r.
 */
/*
 * param buffer is passed in r24:r25
 * param ctx    is passed in r22:r23
 */
/*
 * Register use:
 *   r14:r15 = buffer
 *   r12:r13 = pointer to the round key used last
 *   r16     = round number
 * memxor and inv_sbox128 are defined elsewhere; memxor is called here with
 * destination in r24:r25, source in r22:r23 and length in r20:r21.
 * r1 is assumed to be zero, as the avr-gcc calling convention guarantees.
 */
.global serpent_dec
serpent_dec:
	push_ r12, r13, r14, r15, r16 
	movw r14, r24
	/* ctx += 32*16 to reach k[32]; lo8(32*16) is 0, so only the high byte changes */
	ldi r16, hi8(32*16)
	add r23, r16
	movw r12, r22
	ldi r20, 16
	clr r21
	rcall memxor
	
	movw r24, r14
	ldi r22, 31
	/* rcall like every other call in this file; the call instruction is
	 * not available on devices with 8 KB of flash or less */
	rcall inv_sbox128
	
	/* buffer ^= k[31] */
	movw r24, r14
	ldi r20, 16
	sub r12, r20
	sbc r13, r1
	movw r22, r12
	clr r21
	rcall memxor
	ldi r16, 31
1:
	dec r16
	movw r24, r14
	rcall serpent_inv_lt
	movw r24, r14
	mov r22, r16
	rcall inv_sbox128
	/* step the key pointer back to k[r16] and mix it in */
	movw r24, r14
	ldi r20, 16
	sub r12, r20
	sbc r13, r1
	movw r22, r12
	clr r21
	rcall memxor
	
	tst r16
	brne 1b
	pop_ r16, r15, r14, r13, r12
	ret	
	
	
	














